Here we look at the overlap between the following methods:
# only polyA results
polyA_sets <- hits %>%
filter(Kit == "polyA") %>%
distinct(gene_id) %>%
arrange(gene_id) %>%
mutate(value = 1,
var = paste0("superintronic:", gsub("-", "", CellLine),"_", Kit)) %>%
ungroup() %>%
plyranges::select(gene_id, var, value) %>%
tidyr::spread(var, value) %>%
dplyr::mutate_at(2:3, .funs = ~ dplyr::if_else(is.na(.), 0L, 1L)) %>%
left_join(parts_tbl)
## Joining, by = "gene_id"
totalRNA_sets <- hits %>%
filter(Kit == "total-RNA") %>%
distinct(gene_id) %>%
arrange(gene_id) %>%
mutate(value = 1,
var = paste0("superintronic:", gsub("-", "", CellLine),"_", Kit)) %>%
ungroup() %>%
plyranges::select(gene_id, var, value) %>%
tidyr::spread(var, value) %>%
dplyr::mutate_at(2:3, .funs = ~ dplyr::if_else(is.na(.), 0L, 1L)) %>%
left_join(parts_tbl)
## Joining, by = "gene_id"
totalRNA_sets
## # A tibble: 60 x 4
## gene_id `superintronic:HCC287_to… `superintronic:NCIH1975_t… gene_name
## <chr> <int> <int> <chr>
## 1 ENSG000000058… 1 1 ITGA3
## 2 ENSG000000383… 1 1 TRIO
## 3 ENSG000000504… 0 1 LETMD1
## 4 ENSG000000559… 1 0 PUM2
## 5 ENSG000000672… 1 1 PKM
## 6 ENSG000000716… 0 1 DAZAP1
## 7 ENSG000000963… 1 1 HSP90AB1
## 8 ENSG000001008… 0 1 DHRS2
## 9 ENSG000001012… 0 1 CDC25B
## 10 ENSG000001021… 1 0 PGK1
## # … with 50 more rows
irf_ac <- ir_finder_ac %>%
left_join(parts_tbl, by = "gene_name") %>%
select(gene_id = gene_id.y, gene_name, adj_pval = p_diff) %>%
mutate(`IRFinder_AC` = 1L)
irf_glm <- ir_finder_glm %>%
group_by(gene_name, gene_id) %>%
tidyr::nest() %>%
left_join(parts_tbl, by = "gene_name") %>%
left_join(parts_tbl %>%
mutate(gene_id2 = stringr::str_replace(gene_id, "\\.[1-9]{1,}", "")),
by = c("gene_id.x" = "gene_id2")) %>%
group_by(gene_name.x) %>%
mutate(canon_gene_id = list(unique(c(gene_id, gene_id.y))),
canon_gene_id = lapply(canon_gene_id, function(x) {
if (any(!is.na(x))) return(x[!is.na(x)])
return(NA_character_)
}),
canon_gene_id = as.character(unlist(canon_gene_id))) %>%
ungroup() %>%
select(gene_name = gene_name.y, gene_id = canon_gene_id, data) %>%
filter(!is.na(gene_id)) %>%
mutate(`IRFinder_GLM` = 1L)
irf_glm
## # A tibble: 303 x 4
## gene_name gene_id data IRFinder_GLM
## <chr> <chr> <list> <int>
## 1 ACADVL ENSG00000072778.19 <tibble [12 × 2]> 1
## 2 AC008982.1 ENSG00000268083.5 <tibble [7 × 2]> 1
## 3 CDKN2A ENSG00000147889.17 <tibble [1 × 2]> 1
## 4 HNRNPL ENSG00000104824.16 <tibble [2 × 2]> 1
## 5 CPNE1 ENSG00000214078.12 <tibble [3 × 2]> 1
## 6 NOP56 ENSG00000101361.16 <tibble [3 × 2]> 1
## 7 VCL ENSG00000035403.17 <tibble [2 × 2]> 1
## 8 STAG3L5P-PVRIG2P-PILRB ENSG00000272752.6 <tibble [1 × 2]> 1
## 9 CD44 ENSG00000026508.17 <tibble [5 × 2]> 1
## 10 GLIPR1 ENSG00000139278.9 <tibble [2 × 2]> 1
## # … with 293 more rows
isa_sets <- isa_hits %>%
mutate(`IsoformSwitchAnalyzeR` = 1L) %>%
select(gene_id,
`IsoformSwitchAnalyzeR`,
gene_name,
gene_switch_q_value,
rank = Rank)
isa_sets
## # A tibble: 257 x 5
## gene_id IsoformSwitchAnalyzeR gene_name gene_switch_q_value rank
## <chr> <int> <chr> <dbl> <int>
## 1 ENSG00000234745.10 1 HLA-B 0. 1
## 2 ENSG00000104823.8 1 ECH1 5.95e-210 2
## 3 ENSG00000115641.18 1 FHL2 3.63e-149 3
## 4 ENSG00000206503.12 1 HLA-A 6.83e-141 4
## 5 ENSG00000170291.14 1 ELP5 1.40e-120 5
## 6 ENSG00000008513.14 1 ST3GAL1 2.18e- 59 6
## 7 ENSG00000155324.9 1 GRAMD2B 9.48e- 57 7
## 8 ENSG00000114268.11 1 PFKFB4 2.79e- 54 8
## 9 ENSG00000268043.7 1 NBPF12 1.47e- 52 9
## 10 ENSG00000145736.14 1 GTF2H2 1.14e- 39 10
## # … with 247 more rows
library(index)
## Loading required package: limma
## Loading required package: edgeR
exon <- readRDS(system.file("extdata/exon_dge.Rds", package = "index"))
intron <- readRDS(system.file("extdata/intron_dge.Rds", package = "index"))
group <- readRDS(system.file("extdata/group.Rds", package = "index"))
x <- index_analysis(exon, intron, group, p.value = 0.01)
## creating design matrix by 'model.matrix(~group)'
index_res <- x$tops$intron %>%
filter(adj.P.Val < 0.01) %>%
select(gene_id = GeneID, index_pval = adj.P.Val, logfc = logFC) %>%
mutate(`index` = 1L) %>%
as_tibble()
index_res
## # A tibble: 5,930 x 4
## gene_id index_pval logfc index
## <fct> <dbl> <dbl> <int>
## 1 ENSG00000000003.14 0.00372 -0.995 1
## 2 ENSG00000000457.13 0.00132 -0.487 1
## 3 ENSG00000000460.16 0.000342 -0.804 1
## 4 ENSG00000000971.15 0.0000389 -1.31 1
## 5 ENSG00000001084.10 0.00161 -0.579 1
## 6 ENSG00000001461.16 0.0000205 -1.29 1
## 7 ENSG00000001497.16 0.000567 -0.726 1
## 8 ENSG00000001561.6 0.000316 -2.32 1
## 9 ENSG00000001617.11 0.0000168 4.80 1
## 10 ENSG00000001631.15 0.000621 0.484 1
## # … with 5,920 more rows
For the methods that provide a statistical test (not superintronic), we have taken the genes with a differential intronic region at an FDR of 0.05.
Note that these comparisons are not quite fair since the gene sets selected by each method after filtering are slighty different. Nonetheless we can visualise the overlap using an UpSet plot.
All hits discovered by superintronic except for 19 genes are obtained by the index analysis.
library(UpSetR)
dex_set <- index_res %>%
full_join(totalRNA_sets) %>%
mutate_at(vars(index, starts_with("superintronic")),
.funs = ~ if_else(is.na(.), 0L, .))
## Joining, by = "gene_id"
## Warning: Column `gene_id` joining factor and character vector, coercing into
## character vector
upset(as.data.frame(dex_set))
How many genes in DEList overlap with HCC287 total RNA = 13 How many genes in DEList overlap with NCIH1975 total RNA = 16 How many genes overlap between the overlapping genes = 12.
superintronic detects genes with IR (IR-like profiles). If a gene is detected in Group A and not B, we can loosely say that it has differential IR (DIR) in Group A and B. Similarly for Group B.
Technically, a gene can be detected with IR in both Group A and B, but at different evels thus DIR – these genes are commonly detected in Group A and B, and are DE (see above).
ir_sets <- isa_sets %>%
full_join(polyA_sets) %>%
full_join(irf_ac) %>%
full_join(irf_glm) %>%
mutate_at(vars(
starts_with("Isoform"),
starts_with("superintronic"),
starts_with("IRFinder")),
.funs = ~ if_else(is.na(.), 0L, .))
## Joining, by = c("gene_id", "gene_name")
## Joining, by = c("gene_id", "gene_name")
## Joining, by = c("gene_id", "gene_name")
plot_sets <- ir_sets %>%
select(gene_id,
starts_with("Isoform"),
starts_with("superintronic"),
starts_with("IRFinder"))
upset(as.data.frame(plot_sets))
How many unique genes to HCC287 poly(A) overlap with IRFinder?
0 from AC test; 8 from GLM test
How many unique genes to NCIH1975 poly(A) overlap with IRFinder?
0 from AC test; 11 from GLM test
How many unique genes to HCC287 poly(A) overlap with ISA? 0
How many unique genes to NCIH1975 poly(A) overlap with ISA? 0
Are there any common genes to both polyA celllines, that are also DE in intron counts (any category), that overlap with IRFinder? Yes, see below, there are three genes.
de_sets <- index_res %>%
full_join(polyA_sets) %>%
full_join(irf_glm) %>%
mutate_at(vars(index,
starts_with("superintronic"),
starts_with("IRFinder")),
.funs = ~ if_else(is.na(.), 0L, .))
## Joining, by = "gene_id"
## Warning: Column `gene_id` joining factor and character vector, coercing into
## character vector
## Joining, by = c("gene_id", "gene_name")
de_sets %>%
select(gene_id, index, starts_with("superintronic"), starts_with("IRFinder")) %>%
as.data.frame() %>%
upset()
Are there any common genes to both poly(A) celllines, that are also DE in intron counts (any category), that overlap with ISA? No.Â
Results are mostly stable accross libraries for the cellline mixtures.
libraries <- totalRNA_sets %>%
full_join(polyA_sets) %>%
mutate_at(vars(starts_with("superintronic")),
.funs = ~ if_else(is.na(.), 0L, .))
## Joining, by = c("gene_id", "gene_name")
libraries %>%
select(gene_id, starts_with("superintronic")) %>%
as.data.frame() %>%
upset()